cd C:\Users\chausman\Dropbox\Legacy_utility_costs\Data_package_to_post
 

//////////////Put crosswalk that Marshall generated into Stata
import excel using data/service_territories/modifieddata/iou_cbg_crosswalk_year_2000.xlsx, ///
	firstrow clear
isid cbg FID
isid cbg SVCTERID
unique FID //1259 utilities
unique cbg //189,547 cbgs
save data/service_territories/modifieddata/iou_cbg_crosswalk_year_2000, replace

import excel using data/service_territories/modifieddata/iou_cbg_crosswalk_year_2019.xlsx, ///
	firstrow clear
rename GEOID cbg
isid cbg FID
isid cbg SVCTERID
unique FID //1259 utilities
unique cbg //196,873 cbgs
save data/service_territories/modifieddata/iou_cbg_crosswalk_year_2019, replace



//////////////Assemble Census data for 2000 and ACS data for 2019
//2000
insheet using data/census_cbgs/nhgis0012_csv/nhgis0012_ds147_2000_blck_grp.csv, comma clear
	rename fy4001 hh_2000
	keep hh* gisjoin
gen cbg=subinstr(gisjoin,"G","",.)
	gen statecode=substr(cbg,1,2)
	gen extra1=substr(cbg,3,1)
		tab extra1
		drop extra1
	gen countycode=substr(cbg,4,3)
	gen extra2=substr(cbg,7,1)
		tab extra2
		drop extra2
	gen censustract=substr(cbg,8,6)
	gen censusblockgroup=substr(cbg,14,1)
	replace cbg=statecode+countycode+censustract+censusblockgroup
	drop state county census* gisjoin
	order cbg
//Merge in CBG/IOU crosswalk: NOTE year specific
merge 1:m cbg using data/service_territories/modifieddata/iou_cbg_crosswalk_year_2000
	//_m=1: in Census data but doesn't match to an IOU (fine - we know we haven't covered all of US)
	//_m=2: IOU with no Census data (bad) - there are none
	//_m=3: good
//not currently using cbgareaweight because it does not sum to 1 within a cbg
	duplicates tag cbg, gen(dup)
	//br cbg pop_2000 SVCTERID NAME cbgarea if dup==9
	//br cbg pop_2000 SVCTERID NAME cbgarea if dup==7
	gen weight=1/(dup+1)
		unique cbg //208790
		sum weight
		disp r(mean)*r(N) //208790, good
	//checking we have whole US and nothing additional
		gen temp=hh_2000*weight
		sum temp
			local temp1=r(mean)*r(N)/10^6
			disp `temp1' //versus google says 105 - good	
		sum temp if _m==3
			local temp2=r(mean)*r(N)/10^6		
			disp `temp2'/`temp1' //91% of US households have matched to a utility -- good
//Collapse to utility
keep if _m==3
	drop _m
collapse (sum) hh* [aweight=weight], by(FID SVCTERID NAME)	
save data/census_cbgs/cbgs_2000, replace


//2019
insheet using data/census_cbgs/nhgis0017_csv/nhgis0017_ds244_20195_2019_blck_grp.csv, comma clear
	rename alvze001 hh_2019
	keep hh* gisjoin
save data/census_cbgs/hh_2019, replace

use data/census_cbgs/hh_2019, clear
gen cbg=subinstr(gisjoin,"G","",.)
	gen statecode=substr(cbg,1,2)
	gen extra1=substr(cbg,3,1)
		tab extra1
		drop extra1
	gen countycode=substr(cbg,4,3)
	gen extra2=substr(cbg,7,1)
		tab extra2
		drop extra2
	gen censustract=substr(cbg,8,6)
	gen censusblockgroup=substr(cbg,14,1)
	replace cbg=statecode+countycode+censustract+censusblockgroup
	drop state county census* gisjoin
	order cbg
//Merge in CBG/IOU crosswalk: NOTE year specific
merge 1:m cbg using data/service_territories/modifieddata/iou_cbg_crosswalk_year_2019
	//_m=1: in Census data but doesn't match to an IOU (fine - we know we haven't covered all of US)
	//_m=2: IOU with no Census data (bad) - there are none
	//_m=3: good
//not currently using cbgareaweight because it does not sum to 1 within a cbg
	duplicates tag cbg, gen(dup)
	//br cbg pop_2000 SVCTERID NAME cbgarea if dup==9
	//br cbg pop_2000 SVCTERID NAME cbgarea if dup==7
	gen weight=1/(dup+1)
		unique cbg //220333
		sum weight
		disp r(mean)*r(N) //220333, good
	//checking we have whole US and nothing additional
		gen temp=hh_2019*weight
		sum temp
			local temp1=r(mean)*r(N)/10^6
			disp `temp1' //versus google says 129 - good	
		sum temp if _m==3
			local temp2=r(mean)*r(N)/10^6		
			disp `temp2'/`temp1' //90% of US population have matched to a utility -- good 
//Collapse to utility
keep if _m==3
	drop _m
collapse (sum) hh* [aweight=weight], by(FID SVCTERID NAME)	
save data/census_cbgs/cbgs_2019, replace


///////////
//Combine utility-level identifier and utility-level demographics data
//because Marshall didn't keep the EIA identifier; he kept a different unique identifier 
shp2dta using data/service_territories/originaldata/Natural_Gas_Service_Territories/NG_Service_Terr, ///
	database(data/service_territories/originaldata/Natural_Gas_Service_Territories/database_ng) ///
	coordinates(data/service_territories/originaldata/Natural_Gas_Service_Territories/coord_ng) ///
    genid("myID") replace
	

////////////////////
//Merge all
use data/service_territories/originaldata/Natural_Gas_Service_Territories/database_ng, clear
keep FID SVCTERID NAME COMPID LDC_STATE	RES_CUST
merge 1:1 FID using data/census_cbgs/cbgs_2000, nogen
merge 1:1 FID using data/census_cbgs/cbgs_2019, nogen	
gen company_id_eia=COMPID+LDC_STATE
	drop FID SVCTERID COMPID LDC_STATE
	rename NAME name_service_territories
	rename RES_CUST resi_cust_snapshot_serv_terr
	replace resi_cust_snapshot_serv_terr=. if resi_cust_snapshot_serv_terr<0
duplicates tag company_id_eia, gen(dup)
	drop if dup!=0 //just a few
	drop dup
save data/census_cbgs/utility_demographics, replace


/////////////////
//Bring into main dataset and analyze
use data/legacy_utility_data_controls, clear
merge m:1 company_id_eia using data/census_cbgs/utility_demographics
	sum resi_cust1 if _m!=1
		local temp1=r(mean)*r(N)
		sum resi_cust1
		local temp2=r(mean)*r(N)
		disp `temp1'/`temp2' //87% of our residential customers appear in a service territory
	tab year iou if _m==1	//weighted twoards the front end, not surprisingly - utility ids that were gone by the time LBNL generated the shapefiles	
		//not that utility disappeard necessarily; could be that EIA changed the identifier because eg merger
	list year name_service_territories resi_cust_snapshot if _m==2 ///
		& resi_cust_snapshot!=. & resi_cust_snapshot!=0 //only three. 
	drop if _m==2
	drop _m
//verifying that our residential counts match the shapefiles ones:
corr resi_cust_snapshot resi_cust1 if year==2017 //0.999 correlation
sum resi_cust_snapshot resi_cust1 if year==2017 //comparable means and standard deviations
//we have service territories and accompanying demographics for 87 percent of gas users:
sum resi_cust1 if hh_2000!=.
	local temp1=r(mean)*r(N)/10^6
sum resi_cust1 
	local temp2=r(mean)*r(N)/10^6	
	disp `temp1'/`temp2' 

keep if year==2000 | year==2019
keep year company_id_eia company_name_eia state_abbr resi_total_custom *2000 *2019 portion_resi
reshape wide resi_total_custom portion_resi, i(company_id_eia company_name_eia state) j(year)
keep if state_a!="AK" & state_a!="HI"

gen custchange=(resi_total_custom2019)-(resi_total_custom2000)
gen hhchange=(hh_2019)-(hh_2000)
label variable hhchange "Household count change in service territory"

//TABLE FOR APPENDIX
reg custchange hhchange if portion_resi_sales2000>.9 & portion_resi_sales2019>.9 
estimates store v1

esttab v1 ///
	using householdcounts.tex, replace fragment ///
 	cells(b(fmt(%12.2f) star) se(par fmt(%12.2f)))	///
	stats(N r2, fmt(%9.0fc %9.2f) ///
		labels("Observations" "R$^{2}$" ))	///
	mlabel("Change in Residential Customer Count") ///
	label lz varwidth(16) modelwidth(13) style(tex) ///
	posthead("\\ \hline \\") prefoot("\\ \hline \\") ///
	collabels(,none) nonumbers	///
	coeflabels(_cons "Constant") /// 
	starlevels(* 0.1 ** 0.05 *** 0.01)

